#!/bin/sh
+read _DATE _date_n <<-EOF
+ $(date +"%s %N")
+ EOF
+
. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
. "${_EXEC:-${0%/*}}/cgilite/json.sh"
-[ "$_DATE" ] || _DATE="$(date +%s)"
-
-_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
-_records="${_DATA}/${_INDEX}/_0_DOCS"
+debug "$REQUEST_METHOD $REQUEST_URI $SERVER_PROTOCOL $_DATE"
ingest() {
- local J="$1"
+	local J="$1" content="$2" ztmp="${TMP:-/tmp}/zipfile_$$.zip"
# json_get "$J" title
# json_get "$J" parts.comments
case $(json_get "$J" title) in
*.md|*.txt|*.csv)
- json_get "$J" content |base64 -d
+ printf %s "$content" |base64 -d
;;
*.pdf)
- json_get "$J" content |base64 -d \
- | pdftotext -
+ printf %s "$content" |base64 -d \
+ | pdftotext - -
;;
*.doc)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| catdoc /dev/stdin
;;
*.xls)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| xls2csv /dev/stdin
;;
*.ppt)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| catppt /dev/stdin
;;
*.html|*.xml|*.svg)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| sed 's;<[^>]*>;;g'
;;
*.docx)
- json_get "$J" content |base64 -d \
- | unzip -qc /dev/stdin word/document.xml \
+ printf %s "$content" |base64 -d >"$ztmp"
+ unzip -qc "$ztmp" word/document.xml \
| head -c 128M | sed 's;<[^>]*>;;g'
+ rm -- "$ztmp"
;;
*.xlsx)
- json_get "$J" content |base64 -d \
- | unzip -qc /dev/stdin xl/sharedStrings.xml \
+ printf %s "$content" |base64 -d >"$ztmp"
+ unzip -qc "$ztmp" xl/sharedStrings.xml \
| head -c 128M | sed 's;<[^>]*>; ;g'
+ rm -- "$ztmp"
;;
*.odt)
- json_get "$J" content |base64 -d \
- | unzip -qc /dev/stdin content.xml \
+ printf %s "$content" |base64 -d >"$ztmp"
+ unzip -qc "$ztmp" content.xml \
| head -c 128M | sed 's;<[^>]*>;;g'
+ rm -- "$ztmp"
;;
*.ods|*.odp)
- json_get "$J" content |base64 -d \
- | unzip -qc /dev/stdin content.xml \
+ printf %s "$content" |base64 -d >"$ztmp"
+ unzip -qc "$ztmp" content.xml \
| head -c 128M | sed 's;<[^>]*>; ;g'
+ rm -- "$ztmp"
;;
*):;;
esac
}
+search() {
+ local index="$1" words w num total freq doc date J
+ shift 1; words="$@"
+
+ words="$(printf %s\\n "$words" | awk '
+ BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+ if ( length("ยก") == 1 ) # Utf-8 aware AWK
+ FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+ else # UTF-8 Hack
+ FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
+ fi
+ }
+ { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); }
+ ')"
+
+ for w in ${words}; do
+ [ ! -f "${index}/$w" ] && continue
+
+ while read num total freq doc date; do
+ printf '%s-%i %f\n' "${doc}" "${date}" "$freq"
+ done <"${index}/$w"
+ done \
+ | awk '
+ { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
+ END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
+ for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d;
+ }
+ ' \
+ | sort -nr \
+ | while read freq doc; do
+ date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"
+
+ if J="$(DBM "$_records" get "$doc")"; then
+ [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
+ && printf '%f %s %s\n' \
+ "$freq" "$(STRING "$doc")" "$(STRING "$J")"
+ fi
+ done
+}
+
+_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
+_records="${_DATA}/${_INDEX}/_0_DOCS"
+
+if [ "${_INDEX}" -a ! -d "${_DATA}/${_INDEX}" ]; then
+ printf '%s\r\n' "Status: 404 Not Found" ""
+ exit 0
+elif authlist="$(DBM "${_DATA}/auth.db" get "${_INDEX}" )"; then
+ auth="$(HEADER Authorization)" auth="${auth#Basic }"
+ for a in $authlist deny; do
+ [ "$auth" = "$a" ] && break
+ done
+ if [ "$a" = "deny" -o ! "$auth" ]; then
+ printf '%s\r\n' "Status: 401 Unauthorized" \
+ "WWW-Authenticate: Basic realm=\"Rigid Find\"" "" \
+ | debug
+ exit 0
+ fi
+ unset a auth authlist
+fi
+
if [ "$REQUEST_METHOD" = "PUT" ]; then
_doc="${PATH_INFO#"/${_INDEX}/_doc"}"
- J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
-
- ingest "$J" \
- | "${_EXEC}/concordance.sh" \
- "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
+ J="$(head -c "${CONTENT_LENGTH:-0}")"
+ # Don't use json parser to get content field
+ # Content can be very large and the json parser is slow
+ content="$(printf %s\\n "$J" |sed -E '
+ :X; $bY; N; bX; :Y;
+ s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
+ s;".*$;;
+ s;\\;;g;
+ ')"
+ J="$(printf %s\\n "$J" |sed -E '
+ :X; $bY; N; bX; :Y;
+ s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
+ ')"
+ J="$(json_load "${J}")"
+
+ debug "Content: ${#content} bytes"
+ debug "$(json_dump "$J")"
+
+ if [ "${#content}" -gt 0 ]; then
+ ingest "$J" "$content"\
+ | "${_EXEC}/concordance.sh" \
+ "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
+ fi
J="${J#obj:}"
- J="$(DB2 "$J" delete content)"
J="$(DB2 "$J" set _indexdate num:"$_DATE")"
- if DBM "$_records" insert "$_doc" "$J"; then
+ if [ "${#content}" -eq 0 ]; then
+ printf '%s: %s\r\n' "Status" "200 OK"
+ result="updated"
+ elif DBM "$_records" insert "$_doc" "$J"; then
-		printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \
-		result="created"
+		printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
+		result="created"
elif DBM "$_records" update "$_doc" "$J"; then
exit 0
fi
- sed 's;$;\r;' <<-EOF
- X-elastic-product: Elasticsearch
- content-type: application/vnd.elasticsearch+json;compatible-with=8
-
+	sed 's;\\r$;\r;' <<-EOF
+ X-elastic-product: Elasticsearch\r
+ content-type: application/vnd.elasticsearch+json;compatible-with=8\r
+ \r
{ "_index": $(json_dump str:"${_INDEX}"),
"_id": $(json_dump str:"$_doc"),
"result": "$result",
result="not_found"
fi
- sed 's;$;\r;' <<-EOF
- X-elastic-product: Elasticsearch
- content-type: application/vnd.elasticsearch+json;compatible-with=8
-
+	sed 's;\\r$;\r;' <<-EOF
+ X-elastic-product: Elasticsearch\r
+ content-type: application/vnd.elasticsearch+json;compatible-with=8\r
+ \r
{ "_index": $(json_dump str:"${_INDEX}"),
"_id": $(json_dump str:"$_doc"),
"result": "$result",
exit 0
elif [ "$REQUEST_METHOD" = "POST" ]; then
- :
-elif [ "$REQUEST_METHOD" = "HEAD" ]; then
- accept="$(HEADER Accept)"
- [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
- && ctype="${accept}" || ctype="application/json"
+ J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+ J="$(json_get "$J" query.bool.must.bool.should)"
- sed 's;$;\r;' <<-EOF
- HTTP/1.1 200 OK
- X-elastic-product: Elasticsearch
- content-type: ${ctype}
- EOF
- exit 0
+ words="$(
+ for j in $(DB2 "$J" iterate @); do
+ json_get "$(UNSTRING "$j")" match_phrase_prefix.content
+ done 2>/dev/null |tr \\n ' '
+ )"
+ debug "Search words: $words"
-elif [ "$REQUEST_METHOD" = "GET" ]; then
- accept="$(HEADER Accept)"
- [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
- && ctype="${accept}" || ctype="application/json"
+ results="@ $(
+ search "${_DATA}/${_INDEX}" $words \
+ | while read -r score id source; do
+ debug "Hit: $id $score"
+ S="$(DB2 "" set _index str:"${_INDEX}")"
+ S="$(DB2 "$S" set _id str:"$(UNSTRING "${id#/}")")"
+ S="$(DB2 "$S" set _score num:"$score")"
+ S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
+ printf 'obj:%s\t' "$(STRING "$S")"
+ done
+ )"
+ results="${results% }"
- sed 's;$;\r;' <<-EOF
- HTTP/1.1 200 OK
- X-elastic-product: Elasticsearch
- content-type: ${ctype}
+ t="$(( $(date +%s%N) - ${_DATE}${_date_n} ))"
- EOF
-
- if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
- sed 's;$;\r;' <<-EOF
- { $(json_dump str:"${_INDEX}"): {
- "aliases":{},
- "mappings": {
- "properties": {
- "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
- "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
- "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
- "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
- "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
- "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
- "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
- "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
- "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
- }
- },
- "settings": {
- "index": {
- "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
- "number_of_shards":"1",
- "provided_name": $(json_dump str:"${_INDEX}"),
- "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
- "number_of_replicas":"1",
- "uuid":"0000000000000000000000",
- "version":{"created":"8500010"}
- }
- }
+	sed 's;\\r$;\r;' <<-EOF
+ Status: 200 OK\r
+ X-elastic-product: Elasticsearch\r
+ Content-Type: application/vnd.elasticsearch+json;compatible-with=8\r
+ \r
+ { "took":$((t / 1000000)),
+ "timed_out":false,
+ "hits": {
+ "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
+ "max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
+ "hits": $(json_dump "arr:$results")
}
}
EOF
- else
- sed 's;$;\r;' <<-EOF
- { "name" : "head",
- "cluster_name" : "elasticsearch",
- "version" : {
- "number" : "8.12.1",
- "lucene_version" : "9.9.2",
- "minimum_wire_compatibility_version" : "7.17.0",
- "minimum_index_compatibility_version" : "7.0.0"
- },
- "tagline" : "You Know, for Search"
- }
+
+elif [ "$REQUEST_METHOD" = "HEAD" ]; then
+ accept="$(HEADER Accept)"
+ [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
+ && ctype="${accept}" || ctype="application/json"
+
+	sed 's;\\r$;\r;' <<-EOF
+ Status: 200 OK\r
+ X-elastic-product: Elasticsearch\r
+ content-type: ${ctype}\r
+ \r
EOF
- fi
exit 0
else
- printf '%s\r\n' "Status: 500 Internal Server Error" ""
+ # elif [ "$REQUEST_METHOD" = "GET" ]; then
+	sed 's;\\r$;\r;' <<-EOF
+ Status: 501 Not Implemented\r
+ X-elastic-product: Elasticsearch\r
+ content-type: text/plain\r
+ \r
+ Use the Nextcloud Elastic Search Plugin to use this service.
+ EOF
exit 0
fi